In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

Cross-validation with pipelines


In [2]:
from sklearn.datasets import load_digits
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Load the digits dataset and hold out a test split. random_state=0 fixes the
# shuffle so the split is the same on every run.
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, random_state=0
)

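Without pipelines

Scaling the whole training set before cross-validating lets each fold's held-out samples influence the scaler, so the scores below are contaminated.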


In [3]:
# DON'T do this!
# The scaler is fit on all of X_train *before* cross-validation, so the
# samples held out in each fold have already contributed to the scaling
# statistics. This cell exists only to demonstrate that mistake.
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

cross_val_score(SVC(), X_train_scaled, y_train)


Out[3]:
array([ 0.98230088,  0.97772829,  0.96188341])

With pipelines, no contamination!


In [4]:
from sklearn.pipeline import make_pipeline

# Scaling is a step of the estimator itself and cross_val_score is handed the
# raw, unscaled X_train, so the scaler is fit as part of each fold's training
# rather than once on the full training set.
pipe = make_pipeline(StandardScaler(), SVC())
cross_val_score(estimator=pipe, X=X_train, y=y_train)


Out[4]:
array([ 0.98230088,  0.97772829,  0.9529148 ])


In [ ]: